# Always print this out before your assignment
# Prints the R version, platform, and attached/loaded packages so the
# environment that produced this report is recorded alongside the results.
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods  
## [7] base     
## 
## other attached packages:
## [1] knitr_1.36
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.28   R6_2.5.1        jsonlite_1.7.2  magrittr_2.0.1 
##  [5] evaluate_0.14   rlang_0.4.11    stringi_1.7.5   jquerylib_0.1.4
##  [9] bslib_0.3.1     rmarkdown_2.10  tools_4.1.1     stringr_1.4.0  
## [13] xfun_0.25       yaml_2.2.1      fastmap_1.1.0   compiler_4.1.1 
## [17] htmltools_0.5.2 sass_0.4.0
# Print the current working directory.
getwd()
## [1] "/Users/angpham/Desktop/CPSC_Courses/MGSC310/final_project"
# Load all your libraries in this chunk 
library('tidyverse')
library('dplyr')
library('ggplot2')
library('ggridges')
# NOTE: Do not run install.packages() inside a code chunk. Install them in the console outside of a code chunk. 
# Read the tracks CSV into a data frame. here::here() builds the file path from
# the "spotify_dataset" folder and the file name.
tracks <- read.csv(here::here("spotify_dataset", "spotify_tracks.csv"))
  1. Description of Features

The following link details each of the features in the ‘tracks’ dataset: https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-tracks

# Print a compact, column-by-column overview of the raw tracks data.
glimpse(tracks)
## Rows: 101,939
## Columns: 32
## $ X                 <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, …
## $ acousticness      <dbl> 0.294, 0.863, 0.750, 0.763, 0.770, 0.971, …
## $ album_id          <chr> "0D3QufeCudpQANOR7luqdr", "1bcqsH5UyTBzmh9…
## $ analysis_url      <chr> "https://api.spotify.com/v1/audio-analysis…
## $ artists_id        <chr> "['3mxJuHRn2ZWD5OofvJtDZY']", "['4xWMewm6C…
## $ available_markets <chr> "['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG'…
## $ country           <chr> "BE", "BE", "BE", "BE", "BE", "BE", "BE", …
## $ danceability      <dbl> 0.698, 0.719, 0.466, 0.719, 0.460, 0.367, …
## $ disc_number       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ duration_ms       <dbl> 235584, 656960, 492840, 316578, 558880, 18…
## $ energy            <dbl> 0.6060, 0.3080, 0.9310, 0.1260, 0.9420, 0.…
## $ href              <chr> "https://api.spotify.com/v1/tracks/5qljLQu…
## $ id                <chr> "5qljLQuKnNJf4F4vfxQB0V", "3VAX2MJdmdqARLS…
## $ instrumentalness  <dbl> 0.00000269, 0.00000000, 0.00000000, 0.0000…
## $ key               <dbl> 10, 6, 4, 3, 7, 11, 10, 3, 5, 5, 6, 0, 4, …
## $ liveness          <dbl> 0.1510, 0.2530, 0.9380, 0.1130, 0.9170, 0.…
## $ loudness          <dbl> -7.447, -10.340, -13.605, -20.254, -13.749…
## $ lyrics            <chr> "\n\nPerhaps I am bound to be restless\nAl…
## $ mode              <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, …
## $ name              <chr> "Blood", "The Ugly Duckling", "Jimmy Launc…
## $ playlist          <chr> "Hipsteribrunssi", "Animal Stories", "Best…
## $ popularity        <dbl> 28, 31, 31, 14, 32, 45, 0, 26, 17, 29, 47,…
## $ preview_url       <chr> "https://p.scdn.co/mp3-preview/1b05a902da3…
## $ speechiness       <dbl> 0.0262, 0.9220, 0.9440, 0.9380, 0.9430, 0.…
## $ tempo             <dbl> 115.018, 115.075, 79.565, 112.822, 81.260,…
## $ time_signature    <dbl> 4, 3, 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, …
## $ track_href        <chr> "https://api.spotify.com/v1/tracks/5qljLQu…
## $ track_name_prev   <chr> "track_14", "track_3", "track_4", "track_9…
## $ track_number      <dbl> 1, 3, 4, 1, 2, 8, 2, 11, 6, 6, 1, 3, 12, 5…
## $ uri               <chr> "spotify:track:5qljLQuKnNJf4F4vfxQB0V", "s…
## $ valence           <dbl> 0.6220, 0.5890, 0.0850, 0.5330, 0.0906, 0.…
## $ type              <chr> "track", "track", "track", "track", "track…

Below is a summary of the relevant features that will be used for this question. This includes general information about a track, its popularity, and its audio features:
- id: ID of the track, as identified by Spotify
- name: Name of track
- href: Link to Spotify API for complete information about a track
- uri: Unique identifier to access track on Spotify
- artists_id: List of artists IDs for a track
- album_id: Album ID of the album the track is a part of
- duration_ms: Length of track in milliseconds
- popularity: Popularity score for a track based on Spotify algorithms (Spotify indicates that it is largely based on number of plays.)
- acousticness: Confidence measure from 0.0 to 1.0 of whether the track is acoustic
- danceability: How suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity
- energy: Measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity
- instrumentalness: Predicts whether a track contains no vocals
- key: Key the track is in. Integers map to pitches using standard Pitch Class notation
- liveness: Detects the presence of an audience in the recording
- loudness: Quality of a sound that is the primary psychological correlate of physical strength (amplitude), measured in decibels (dB)
- mode: Modality (major or minor) of a track
- speechiness: Presence of spoken words in a track
- tempo: Speed or pace of a given piece in beats per minute (BPM)
- time_signature: Notational convention to specify how many beats are in each bar (or measure)
- valence: Musical positiveness conveyed by a track

# Columns kept for the analysis: track identifiers, popularity, and the audio
# features.
relevant_cols <- c(
  "id", "name", "href", "uri", "artists_id", "album_id", "duration_ms",
  "popularity", "acousticness", "danceability", "energy", "instrumentalness",
  "key", "liveness", "loudness", "mode", "speechiness", "tempo",
  "time_signature", "valence"
)

# Omit any tracks with an NA value in any column, then keep only the relevant
# columns. The column subset is taken from the NA-free rows rather than from
# the raw `tracks` data frame, so the NA filtering is not overwritten.
tracks_clean <- na.omit(tracks)[, relevant_cols]

# Print a compact overview of the cleaned data to confirm the retained columns.
glimpse(tracks_clean)
## Rows: 101,939
## Columns: 20
## $ id               <chr> "5qljLQuKnNJf4F4vfxQB0V", "3VAX2MJdmdqARLSU…
## $ name             <chr> "Blood", "The Ugly Duckling", "Jimmy Launch…
## $ href             <chr> "https://api.spotify.com/v1/tracks/5qljLQuK…
## $ uri              <chr> "spotify:track:5qljLQuKnNJf4F4vfxQB0V", "sp…
## $ artists_id       <chr> "['3mxJuHRn2ZWD5OofvJtDZY']", "['4xWMewm6CY…
## $ album_id         <chr> "0D3QufeCudpQANOR7luqdr", "1bcqsH5UyTBzmh9Y…
## $ duration_ms      <dbl> 235584, 656960, 492840, 316578, 558880, 183…
## $ popularity       <dbl> 28, 31, 31, 14, 32, 45, 0, 26, 17, 29, 47, …
## $ acousticness     <dbl> 0.294, 0.863, 0.750, 0.763, 0.770, 0.971, 0…
## $ danceability     <dbl> 0.698, 0.719, 0.466, 0.719, 0.460, 0.367, 0…
## $ energy           <dbl> 0.6060, 0.3080, 0.9310, 0.1260, 0.9420, 0.3…
## $ instrumentalness <dbl> 0.00000269, 0.00000000, 0.00000000, 0.00000…
## $ key              <dbl> 10, 6, 4, 3, 7, 11, 10, 3, 5, 5, 6, 0, 4, 5…
## $ liveness         <dbl> 0.1510, 0.2530, 0.9380, 0.1130, 0.9170, 0.6…
## $ loudness         <dbl> -7.447, -10.340, -13.605, -20.254, -13.749,…
## $ mode             <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1…
## $ speechiness      <dbl> 0.0262, 0.9220, 0.9440, 0.9380, 0.9430, 0.0…
## $ tempo            <dbl> 115.018, 115.075, 79.565, 112.822, 81.260, …
## $ time_signature   <dbl> 4, 3, 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4…
## $ valence          <dbl> 0.6220, 0.5890, 0.0850, 0.5330, 0.0906, 0.1…
# Per-column summary of the cleaned data (length/class for character columns,
# min/quartiles/mean/max for numeric columns).
summary(tracks_clean)
##       id                name               href          
##  Length:101939      Length:101939      Length:101939     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      uri             artists_id          album_id        
##  Length:101939      Length:101939      Length:101939     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   duration_ms        popularity     acousticness     danceability  
##  Min.   :   1155   Min.   : 0.00   Min.   :0.0000   Min.   :0.000  
##  1st Qu.: 184000   1st Qu.:29.00   1st Qu.:0.0407   1st Qu.:0.480  
##  Median : 216893   Median :41.00   Median :0.2380   Median :0.610  
##  Mean   : 246771   Mean   :39.78   Mean   :0.3521   Mean   :0.586  
##  3rd Qu.: 261055   3rd Qu.:52.00   3rd Qu.:0.6450   3rd Qu.:0.714  
##  Max.   :5505831   Max.   :97.00   Max.   :0.9960   Max.   :0.989  
##      energy       instrumentalness         key        
##  Min.   :0.0000   Min.   :0.0000000   Min.   : 0.000  
##  1st Qu.:0.4110   1st Qu.:0.0000000   1st Qu.: 2.000  
##  Median :0.6290   Median :0.0000375   Median : 5.000  
##  Mean   :0.5865   Mean   :0.1487759   Mean   : 5.271  
##  3rd Qu.:0.7980   3rd Qu.:0.0344000   3rd Qu.: 8.000  
##  Max.   :1.0000   Max.   :1.0000000   Max.   :11.000  
##     liveness         loudness            mode         speechiness    
##  Min.   :0.0000   Min.   :-60.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0956   1st Qu.:-11.149   1st Qu.:0.0000   1st Qu.:0.0364  
##  Median :0.1240   Median : -7.599   Median :1.0000   Median :0.0506  
##  Mean   :0.1976   Mean   : -9.463   Mean   :0.6182   Mean   :0.1288  
##  3rd Qu.:0.2410   3rd Qu.: -5.509   3rd Qu.:1.0000   3rd Qu.:0.1040  
##  Max.   :0.9990   Max.   :  2.719   Max.   :1.0000   Max.   :0.9690  
##      tempo        time_signature     valence      
##  Min.   :  0.00   Min.   :0.000   Min.   :0.0000  
##  1st Qu.: 95.97   1st Qu.:4.000   1st Qu.:0.2710  
##  Median :118.07   Median :4.000   Median :0.4770  
##  Mean   :118.36   Mean   :3.876   Mean   :0.4828  
##  3rd Qu.:136.04   3rd Qu.:4.000   3rd Qu.:0.6930  
##  Max.   :244.03   Max.   :5.000   Max.   :0.9930

Below is a histogram showing the spread of the ‘popularity’ variable. From the plot below, we can see that the ‘popularity’ variable is approximately normally distributed over the range 0 - 100, with 100 indicating most popular. The ‘popularity’ variable is not ordered by rank. Spotify indicates it is largely based on number of plays, but includes other factors as well.

# Histogram of the popularity scores in bins of width 5. The column is named
# directly inside aes() (no `tracks_clean$` prefix) so ggplot2 looks it up in
# the data argument and labels the axis with the column name.
ggplot(tracks_clean, aes(x = popularity)) +
  geom_histogram(binwidth = 5) +
  theme_minimal()

The following plots show relationships between the popularity scores and each audio feature. The relationships are not linear; however, some plots, like those for danceability and energy, clearly show that higher values in those audio features indicate more popular music, as indicated by darker points in the top right-hand corner.

# Audio features to plot against popularity, in display order.
scatter_features <- c(
  "acousticness", "danceability", "energy", "instrumentalness", "key",
  "liveness", "loudness", "mode", "speechiness", "tempo", "time_signature",
  "valence"
)

# Build a scatter plot of popularity against one audio feature.
#   feature: name of a column in `data`, given as a string.
#   data:    data frame containing `feature` and `popularity`.
# Returns the ggplot object. Points are drawn with alpha = 0.05 so that dense
# regions appear darker, and ggplot2's default smoother is overlaid.
# NOTE(review): for features with very few distinct values (e.g. mode) the
# default smoother may fail to fit, leaving the trend line out -- confirm
# against the rendered plots.
plot_popularity_scatter <- function(feature, data = tracks_clean) {
  ggplot(data, aes(x = .data[[feature]], y = popularity)) +
    geom_point(alpha = 0.05) +
    geom_smooth() +
    labs(x = feature) + # keep the plain column name as the axis label
    theme_minimal()
}

# print() is needed because ggplot objects are not auto-printed inside a loop.
for (feature in scatter_features) {
  print(plot_popularity_scatter(feature))
}

The boxplot below shows a clearer picture of the distribution of the ‘popularity’ variable. Our group has decided that any scores above the 75th percentile will be considered a hit. The 75th percentile is at the popularity score of 52.00, as indicated by the next code chunk.

# Boxplot summarising the distribution of the popularity scores.
tracks_clean %>%
  ggplot(mapping = aes(x = popularity)) +
  geom_boxplot() +
  theme_minimal()

# 75th percentile of the popularity scores; used as the cut-off for a "hit".
# The argument is spelled out as `probs` instead of relying on partial matching.
percentile_75 <- quantile(tracks_clean$popularity, probs = 0.75)
# %g formats a double directly. %i only accepts whole-number values and raises
# an error if the quantile falls between two integers.
sprintf("75th Percentile = %g", as.numeric(percentile_75))
## [1] "75th Percentile = 52"
# Flag each track as a hit when its popularity is strictly above the 75th
# percentile. `hit` is stored as a factor with levels "0" (not a hit) and "1"
# (hit).
tracks_hit <- mutate(
  tracks_clean,
  hit = factor(ifelse(popularity > percentile_75, 1, 0), levels = c("0", "1"))
)
glimpse(tracks_hit)
## Rows: 101,939
## Columns: 21
## $ id               <chr> "5qljLQuKnNJf4F4vfxQB0V", "3VAX2MJdmdqARLSU…
## $ name             <chr> "Blood", "The Ugly Duckling", "Jimmy Launch…
## $ href             <chr> "https://api.spotify.com/v1/tracks/5qljLQuK…
## $ uri              <chr> "spotify:track:5qljLQuKnNJf4F4vfxQB0V", "sp…
## $ artists_id       <chr> "['3mxJuHRn2ZWD5OofvJtDZY']", "['4xWMewm6CY…
## $ album_id         <chr> "0D3QufeCudpQANOR7luqdr", "1bcqsH5UyTBzmh9Y…
## $ duration_ms      <dbl> 235584, 656960, 492840, 316578, 558880, 183…
## $ popularity       <dbl> 28, 31, 31, 14, 32, 45, 0, 26, 17, 29, 47, …
## $ acousticness     <dbl> 0.294, 0.863, 0.750, 0.763, 0.770, 0.971, 0…
## $ danceability     <dbl> 0.698, 0.719, 0.466, 0.719, 0.460, 0.367, 0…
## $ energy           <dbl> 0.6060, 0.3080, 0.9310, 0.1260, 0.9420, 0.3…
## $ instrumentalness <dbl> 0.00000269, 0.00000000, 0.00000000, 0.00000…
## $ key              <dbl> 10, 6, 4, 3, 7, 11, 10, 3, 5, 5, 6, 0, 4, 5…
## $ liveness         <dbl> 0.1510, 0.2530, 0.9380, 0.1130, 0.9170, 0.6…
## $ loudness         <dbl> -7.447, -10.340, -13.605, -20.254, -13.749,…
## $ mode             <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1…
## $ speechiness      <dbl> 0.0262, 0.9220, 0.9440, 0.9380, 0.9430, 0.0…
## $ tempo            <dbl> 115.018, 115.075, 79.565, 112.822, 81.260, …
## $ time_signature   <dbl> 4, 3, 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4…
## $ valence          <dbl> 0.6220, 0.5890, 0.0850, 0.5330, 0.0906, 0.1…
## $ hit              <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

The following boxplots compare the distribution of each audio feature for hits and non-hits. Most of the boxplots look the same whether or not the track is a hit, indicating that those audio features may not be the strongest in determining popularity. However, the differences seen in the ‘danceability’ and ‘energy’ boxplots indicate that they may be strong features in determining popularity.

# Audio features to compare between hits and non-hits, in display order.
hit_boxplot_features <- c(
  "acousticness", "danceability", "energy", "instrumentalness", "key",
  "liveness", "loudness", "mode", "speechiness", "tempo", "time_signature",
  "valence"
)

# Build side-by-side boxplots of one audio feature for non-hits vs. hits.
#   feature: name of a column in `data`, given as a string.
#   data:    data frame containing `feature` and the factor `hit`.
# Returns the ggplot object. Fill is mapped to `hit`, the variable that defines
# the boxes. The continuous `popularity` column varies within each box, so it
# cannot supply a single fill colour per box.
plot_feature_by_hit <- function(feature, data = tracks_hit) {
  ggplot(data, aes(x = hit, y = .data[[feature]], fill = hit)) +
    geom_boxplot() +
    labs(y = feature) + # keep the plain column name as the axis label
    theme_minimal()
}

# print() is needed because ggplot objects are not auto-printed inside a loop.
for (feature in hit_boxplot_features) {
  print(plot_feature_by_hit(feature))
}

# Load the artist/genre data and drop every row that has an NA in any column.
artist_genres <- read.csv(here::here("spotify_dataset", "artist_genres.csv"))
artist_genres_clean <- artist_genres %>%
  na.omit()

# Print a compact overview of the cleaned artist/genre data.
glimpse(artist_genres_clean)
## Rows: 34,680
## Columns: 11
## $ Artist       <chr> "Juliano Cezar", "The Grenadines", "Gangway", "…
## $ ID           <chr> "4mGnpjhqgx4RUdsIJiURdo", "1dLnVku4VQUOLswwDFvR…
## $ Genre        <chr> "other", "rock", "pop", "pop", "other", "electr…
## $ acousticness <dbl> 0.2651667, 0.0331000, 0.3940000, 0.0122000, 0.9…
## $ danceability <dbl> 0.4486667, 0.5150000, 0.7010000, 0.3480000, 0.3…
## $ energy       <dbl> 0.6696667, 0.3730000, 0.6260000, 0.5400000, 0.1…
## $ loudness     <dbl> -4.549000, -9.872000, -11.246000, -8.051000, -1…
## $ liveness     <dbl> 0.2873333, 0.0822000, 0.0813000, 0.1110000, 0.1…
## $ speechiness  <dbl> 0.0345000, 0.0241000, 0.0268000, 0.0415000, 0.0…
## $ tempo        <dbl> 100.22567, 95.00500, 97.98800, 103.56500, 151.9…
## $ valence      <dbl> 0.3743333, 0.2460000, 0.9350000, 0.1620000, 0.5…
# Per-column summary of the cleaned artist/genre data.
summary(artist_genres_clean)
##     Artist               ID               Genre          
##  Length:34680       Length:34680       Length:34680      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   acousticness     danceability        energy          loudness      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :-57.436  
##  1st Qu.:0.0462   1st Qu.:0.4634   1st Qu.:0.4450   1st Qu.:-10.815  
##  Median :0.2298   Median :0.5990   Median :0.6450   Median : -7.616  
##  Mean   :0.3415   Mean   :0.5744   Mean   :0.5987   Mean   : -9.453  
##  3rd Qu.:0.5930   3rd Qu.:0.7090   3rd Qu.:0.7980   3rd Qu.: -5.652  
##  Max.   :0.9960   Max.   :0.9840   Max.   :1.0000   Max.   :  1.605  
##     liveness       speechiness          tempo          valence      
##  Min.   :0.0000   Min.   :0.00000   Min.   :  0.0   Min.   :0.0000  
##  1st Qu.:0.1007   1st Qu.:0.03900   1st Qu.:101.6   1st Qu.:0.2780  
##  Median :0.1340   Median :0.05350   Median :119.6   Median :0.4749  
##  Mean   :0.1864   Mean   :0.09443   Mean   :119.4   Mean   :0.4751  
##  3rd Qu.:0.2232   3rd Qu.:0.09870   3rd Qu.:133.9   3rd Qu.:0.6690  
##  Max.   :0.9990   Max.   :0.96300   Max.   :244.0   Max.   :0.9920

The density ridges for each audio feature help visualize what each genre sounds like.

# Draw one ridgeline plot per audio feature: the feature's density runs along
# the x axis, with a separate ridge for each genre.
ridge_features <- c(
  "acousticness", "danceability", "energy", "loudness",
  "liveness", "speechiness", "tempo", "valence"
)

for (feature in ridge_features) {
  ridge_plot <- ggplot(
    artist_genres_clean,
    aes(x = .data[[feature]], y = Genre)
  ) +
    geom_density_ridges() +
    labs(x = feature) + # keep the plain column name as the axis label
    theme_minimal()

  # Explicit print: ggplot objects are not auto-printed inside a loop.
  print(ridge_plot)
}